506 hw5

Author

Ruyue Jiang

repo link: https://github.com/gracejry/Stats506-_FA24.git

Problem 1

Create a class to represent rational numbers (numbers of the form a/b for integers a and b). Do this using S4.

  1. For the rational class, define the following:

    1. A constructor

    2. A validator that ensures the denominator is non-zero.

    3. A show method.

    4. A simplify method, to obtain the simplest form (e.g. simplify(2/4) produces 1/2).

    5. A quotient method (e.g. quotient(3/7) produces .42857143...). It should support a digits argument but only in the printing, not the returned result (Hint: what does print return?).

    6. Addition, subtraction, multiplication, division. These should all return a rational.

    7. You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.

#install.packages("Rcpp")
library(Rcpp)

# Compile the GCD/LCM helpers with Rcpp.
# sourceCpp() is used rather than cppFunction() because cppFunction() is meant
# for a single function definition: with two functions in one string only the
# first one is exported, so lcm_r would never become callable from R.
sourceCpp(code = '
// [[Rcpp::plugins(cpp17)]]
#include <Rcpp.h>
#include <cstdlib>
#include <numeric>

// Greatest common divisor of the absolute values of a and b (std::gcd, C++17).
// [[Rcpp::export]]
int gcd_r(int a, int b) {
    return std::gcd(std::abs(a), std::abs(b));
}

// Least common multiple of the absolute values of a and b.
// Returns 0 when either input is 0, which also avoids dividing by gcd(0, 0).
// [[Rcpp::export]]
int lcm_r(int a, int b) {
    a = std::abs(a);
    b = std::abs(b);
    if (a == 0 || b == 0) {
        return 0;
    }
    // Divide before multiplying to keep the intermediate value small.
    return a * (b / std::gcd(a, b));
}
')
#' Rational Number Class
#'
#' S4 class representing a rational number (fraction) as a numerator and a
#' denominator, each stored in a numeric slot.
#'
#' @slot numerator Numeric. The numerator of the fraction.
#' @slot denominator Numeric. The denominator of the fraction (non-zero).
#'
#' @section Validation:
#' - Numerator and denominator must each be a single, finite, whole number.
#' - The denominator cannot be zero.
#'
# Define the "rational" S4 class
setClass(
    "rational",
    slots = list(
        numerator = "numeric",     # The numerator of the fraction
        denominator = "numeric"    # The denominator of the fraction
    ),
    # S4 validity functions report problems by returning a character vector
    # (validObject() turns it into the error) and return TRUE when valid.
    # The slot types already guarantee both values are numeric.
    validity = function(object) {
        problems <- character()
        for (slot_name in c("numerator", "denominator")) {
            value <- slot(object, slot_name)
            if (length(value) != 1L) {
                problems <- c(
                    problems,
                    paste0(slot_name, " must be a single number.")
                )
            } else if (!is.finite(value) || value != round(value)) {
                problems <- c(
                    problems,
                    paste0(slot_name, " must be a finite whole number.")
                )
            }
        }
        # Only compare against zero once the denominator is a usable scalar;
        # `==` on a zero-length or NA value cannot be used inside `if`.
        if (length(problems) == 0L && object@denominator == 0) {
            problems <- c(problems, "Denominator cannot be zero.")
        }
        if (length(problems) > 0L) problems else TRUE
    }
)
#' Create a Rational Number
#'
#' A constructor function to create objects of the "rational" S4 class.
#' The sign is moved to the numerator and a non-zero fraction is reduced to
#' lowest terms using the greatest common divisor (GCD). A zero numerator is
#' stored with the (positive) denominator as given.
#'
#' @param numerator Numeric. A single whole number; the numerator.
#' @param denominator Numeric. A single non-zero whole number; the denominator.
#'
#' @return An object of class "rational".
#'
#' @details
#' Raises an error when either argument is empty, has more than one element,
#' is not numeric, is not a finite whole number, or when the denominator is 0.
#'
createRational <- function(numerator, denominator) {
    # Validate up front so malformed input fails with a clear message instead
    # of an obscure error from `if` or from the compiled GCD helper.
    if (length(numerator) == 0L || length(denominator) == 0L) {
        stop("Error: Missing numerator or denominator.")
    }
    if (length(numerator) != 1L || length(denominator) != 1L) {
        stop("Error: Numerator and denominator must each be a single value.")
    }
    if (!is.numeric(numerator) || !is.numeric(denominator)) {
        stop("Error: Both numerator and denominator must be numeric.")
    }
    if (!is.finite(numerator) || !is.finite(denominator) ||
        numerator != round(numerator) || denominator != round(denominator)) {
        stop("Error: Both numerator and denominator must be whole numbers.")
    }
    # Check for zero denominator
    if (denominator == 0) stop("Error: Denominator cannot be zero.")
    # Ensure denominator is positive
    if (denominator < 0) {
        numerator <- -numerator
        denominator <- -denominator
    }
    # Skip simplification for zero numerator
    if (numerator == 0) {
        return(new("rational", numerator = numerator, denominator = denominator))
    }
    # Simplify the fraction
    divisor <- gcd_r(numerator, denominator) # Find GCD of numerator and denominator
    new("rational", numerator = numerator / divisor, denominator = denominator / divisor)
}
#' Print a Rational Number
#'
#' `show` method for the "rational" class; writes the fraction to the console
#' on a single line.
#'
#' @param object An object of class "rational".
#'
#' @details
#' - A non-zero numerator is written as "numerator / denominator".
#' - A zero numerator is written as "0 / denominator".
#'
# show method for the "rational" class
setMethod(
    "show",
    signature(object = "rational"),
    function(object) {
        top <- object@numerator
        bottom <- object@denominator
        if (top != 0) {
            cat(top, "/", bottom, "\n")
        } else {
            cat("0 /", bottom, "\n")
        }
    }
)
#' Reduce a Rational Number to Lowest Terms
#'
#' Divides the numerator and denominator of a "rational" object by their
#' greatest common divisor and rebuilds the object through the constructor.
#'
#' @param r An object of class "rational".
#'
#' @return A new "rational" object holding the reduced fraction.
#'
# Reduce a rational number
simplify <- function(r) {
    top <- r@numerator
    bottom <- r@denominator
    common <- gcd_r(top, bottom)  # shared factor of numerator and denominator
    createRational(top / common, bottom / common)
}
#' Arithmetic on Rational Numbers
#'
#' `+`, `-`, `*` and `/` are defined for pairs of "rational" objects.
#'
#' @param e1 A rational number (left operand)
#' @param e2 A rational number (right operand)
#'
#' @return A new "rational" object holding the result in lowest terms.
#'
#' @details
#' - Addition: cross-multiplies onto the product of the denominators, then adds.
#' - Subtraction: cross-multiplies onto the product of the denominators, then subtracts.
#' - Multiplication: multiplies numerators together and denominators together.
#' - Division: multiplies the first operand by the reciprocal of the second;
#'   dividing by a rational equal to zero is an error.
#'
#'
# rational + rational
setMethod(
    "+",
    signature(e1 = "rational", e2 = "rational"),
    function(e1, e2) {
        # a/b + c/d = (a*d + c*b) / (b*d)
        common_denominator <- e1@denominator * e2@denominator
        cross_sum <- e1@numerator * e2@denominator + e2@numerator * e1@denominator
        simplify(createRational(cross_sum, common_denominator))
    }
)

# rational - rational
setMethod(
    "-",
    signature(e1 = "rational", e2 = "rational"),
    function(e1, e2) {
        # a/b - c/d = (a*d - c*b) / (b*d)
        common_denominator <- e1@denominator * e2@denominator
        cross_difference <- e1@numerator * e2@denominator - e2@numerator * e1@denominator
        simplify(createRational(cross_difference, common_denominator))
    }
)

# rational * rational
setMethod(
    "*",
    signature(e1 = "rational", e2 = "rational"),
    function(e1, e2) {
        # (a/b) * (c/d) = (a*c) / (b*d)
        product_denominator <- e1@denominator * e2@denominator
        product_numerator <- e1@numerator * e2@numerator
        simplify(createRational(product_numerator, product_denominator))
    }
)

# rational / rational
setMethod(
    "/",
    signature(e1 = "rational", e2 = "rational"),
    function(e1, e2) {
        # A divisor with a zero numerator is the number zero.
        if (e2@numerator == 0) {
            stop("Error: Cannot divide by zero.")
        }
        # (a/b) / (c/d) = (a*d) / (b*c)
        quotient_denominator <- e1@denominator * e2@numerator
        quotient_numerator <- e1@numerator * e2@denominator
        simplify(createRational(quotient_numerator, quotient_denominator))
    }
)
#' Compute the Decimal Value of a Rational Number
#'
#' Computes the decimal (quotient) representation of a rational number.
#' `digits` only controls how the value is printed; the returned number is
#' never rounded.
#'
#' @param r A rational number
#' @param digits Optional non-negative whole number giving the number of
#'   significant digits to print. If `NULL`, nothing is printed by this
#'   function and the value is returned visibly.
#'
#' @return The unrounded decimal value as a numeric. When `digits` is given,
#'   the value is printed with that many significant digits and returned
#'   invisibly.
#'
#' @details
#' `digits`, when supplied, must be a single numeric value that is a
#' non-negative whole number; anything else triggers an error.
#'
#'
# Function to compute the decimal (quotient) of a rational number
quotient <- function(r, digits = NULL) {
    value <- r@numerator / r@denominator # Compute the decimal value
    if (is.null(digits)) {
        return(value)
    }
    # Reject non-numeric, non-scalar, missing, fractional or negative digits.
    # The scalar and NA checks come first so the comparisons below always
    # yield a single TRUE/FALSE.
    if (!is.numeric(digits) || length(digits) != 1L || is.na(digits) ||
        digits != round(digits) || digits < 0) {
        stop("Error: 'digits' must be a non-negative integer.")
    }
    # Print with the requested precision but hand back the full-precision
    # number, so `digits` never changes the stored result.
    print(value, digits = digits)
    invisible(value)
}

b. Use your rational class to create three objects:

  • r1: 24/6

  • r2: 72/30

  • r3: 0/4

# Build the three rationals requested by the assignment (24/6, 72/30, 0/4)
r1 <- createRational(24, 6)   # Represents 24 / 6
r2 <- createRational(72, 30)  # Represents 72 / 30
r3 <- createRational(0, 4)    # Represents 0 / 4
r1
4 / 1 
r3
0 / 4 
r1 + r2
32 / 5 
r1 - r2
8 / 5 
r1 * r2
48 / 5 
r1 / r2
5 / 3 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in r2/r3: Error: Cannot divide by zero.
quotient(r1)
[1] 4
quotient(r2)
[1] 2.4
quotient(r2, digits = 3)
[1] "2.4"
quotient(r2, digits = 3.14)
Error in quotient(r2, digits = 3.14): Error: 'digits' must be a non-negative integer.
quotient(r2, digits = "avocado")
Error in quotient(r2, digits = "avocado"): Error: 'digits' must be a non-negative integer.
q2 <- quotient(r2, digits = 3)
q2
[1] "2.4"
quotient(r3)
[1] 0
simplify(r1)
4 / 1 
simplify(r2)
12 / 5 
simplify(r3)
0 / 1 

c. Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.

Note that there are a lot of choices to be made here. How are you going to store the class? Two numerics? A vector of length two? A formula? A string? What are users going to pass into the constructor? A string (“24/6”)? Two arguments? A vector?

There is no right answer to those questions. Make the best decision you can, and don’t be afraid to change it if your decision causes unforeseen difficulties.

You may not use any existing R functions or packages that would trivialize this assignment. (E.g. if you found an existing package that does this, or found a function that automatically produces the quotient or simplified version, that is not able to be used.)

Hint: It may be useful to define other functions that I don’t explicitly ask for.

# Case 1: Valid numeric inputs
createRational(24, 6) # Should simplify to 4/1
4 / 1 
# Case 2: Invalid numeric input: Zero denominator
createRational(24, 0) # Expect: "Error: Denominator cannot be zero."
Error in createRational(24, 0): Error: Denominator cannot be zero.
# Case 3: Missing numerator or denominator
createRational(24)  # Should throw R's built-in "argument is missing, with no default" error
Error in createRational(24): argument "denominator" is missing, with no default
createRational()    # Should throw R's built-in "argument is missing, with no default" error
Error in createRational(): argument "denominator" is missing, with no default
# Case 4: Empty vector (c() is NULL, so both arguments are zero-length)
empty_vec <- c()
createRational(empty_vec[1], empty_vec[2])  # Should throw "Error: Missing numerator or denominator."
Error in if (denominator == 0) stop("Error: Denominator cannot be zero."): argument is of length zero
# Case 5: Non-numeric (character) input
createRational("24", "6")  # Should throw "Error: Both numerator and denominator must be numeric."
Error in eval(expr, envir, enclos): Not compatible with requested type: [type=character; target=integer].
createRational(24, "six")  # Should throw "Error: Both numerator and denominator must be numeric."
Error in eval(expr, envir, enclos): Not compatible with requested type: [type=character; target=integer].
# Case 6: Negative denominator
createRational(5, -3) # The sign should move to the numerator, leaving a positive denominator
-5 / 3 
# Case 7: Numerator is zero
createRational(0, 10) # A zero numerator is kept as 0/10 (not reduced by the constructor)
0 / 10 
# Case 8: Division by zero rational number
r4 <- createRational(1, 2)
r5 <- createRational(0, 1) # This rational number represents 0
r4 / r5 # Expect: "Error: Cannot divide by zero."
Error in r4/r5: Error: Cannot divide by zero.
# Case 9: Invalid 'digits' argument
# non-numeric
quotient(createRational(1, 3), digits = "three")  # Expect: "Error: 'digits' must be a non-negative integer."
Error in quotient(createRational(1, 3), digits = "three"): Error: 'digits' must be a non-negative integer.
# negative integer
quotient(createRational(1, 3), digits = -1)  # Expect: "Error: 'digits' must be a non-negative integer."
Error in quotient(createRational(1, 3), digits = -1): Error: 'digits' must be a non-negative integer.

Problem 2

Let’s revisit the art data from the last problem set. Use plotly for these.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("plotly")
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
# Load the art sales data. NOTE(review): the path points at a local Downloads
# folder, so it must be adjusted to run this on another machine.
art_sales <- read_csv("~/Downloads/df_for_ml_improved_new_market.csv")
Rows: 4347 Columns: 112
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): eventdate
dbl (111): id, case_id, year, height, width, size_inchsqr, price_usd, meanpr...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  1. Regenerate your plot which addresses the second question from last time:

    1. Does the distribution of genre of sales across years appear to change?

    You may copy your plot from last time, or copy my plot from the solutions, or come up with your own new plot.

# Data preparation
# Flag artworks that fall into more than one genre with a 'Genre___Multiple'
# indicator: 0 when at most one genre flag is set, 1 otherwise
art_sale1 <- art_sales %>%
  mutate(
    Genre___Multiple = if_else(
      Genre___Photography + Genre___Print + Genre___Sculpture +
        Genre___Painting + Genre___Others <= 1,
      0,
      1
    )
  )


# Reshape to one row per (sale, genre) pair: stack every "Genre___" indicator
# column, stripping the prefix from the names as they become 'genre' values,
# and keep only the genres that apply to the sale
art_genres <- art_sale1 %>%
  pivot_longer(
    cols = starts_with("Genre___"),
    names_to = "genre",
    names_prefix = "Genre___",
    values_to = "is_genre"
  ) %>%
  filter(is_genre == 1)
art_genres
# A tibble: 5,385 × 109
      id case_id  year height width size_inchsqr price_usd meanprice_year
   <dbl>   <dbl> <dbl>  <dbl> <dbl>        <dbl>     <dbl>          <dbl>
 1     0   57649  1997     29    24          696      4160         247.  
 2     0   57649  1997     29    24          696      4160         247.  
 3     0   57649  1997     29    24          696      4160         247.  
 4     1   30468  1997     17    14          238      2340          13.9 
 5     2   85464  1997     28    22          616      3640          26.5 
 6     3   27308  1997     32    39         1248     10832          18.2 
 7     3   27308  1997     32    39         1248     10832          18.2 
 8     3   27308  1997     32    39         1248     10832          18.2 
 9     4   82202  1997     46    37         1702     13210           5.26
10     5   60932  1997     50    43         2150      3434          30.4 
# ℹ 5,375 more rows
# ℹ 101 more variables: min_price <dbl>, max_price <dbl>,
#   medianprice_year <dbl>, cnt_mean <dbl>, cnt_max <dbl>, cnt_median <dbl>,
#   cot_mean <dbl>, cot_max <dbl>, cot_median <dbl>, ranking <dbl>,
#   fest_biennal <dbl>, private_inst <dbl>, public_inst <dbl>, solo_show <dbl>,
#   group_show <dbl>, age <dbl>, estimate_min_usd <dbl>,
#   estimate_max_usd <dbl>, estimate_center_usd <dbl>, …
# ggplot2 version of the chart from the last problem set:
# sales per year as stacked, slightly transparent bars coloured by genre
art_genres %>%
  ggplot(aes(x = factor(year), fill = genre)) +
  geom_bar(position = "stack", alpha = 0.75) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1) # angled year labels stay readable
  ) +
  labs(
    title = "Distribution of Art Sales Genres Over Time",
    x = "Year",
    y = "Count of Sales",
    fill = "Genre"
  )

# Interactive (plotly) version of the stacked bar chart
plotly_genre_plot <- art_genres %>%
  group_by(year, genre) %>%
  summarise(n = n(), .groups = "drop") %>%  # number of sales per year and genre
  plot_ly(
    type = "bar",
    x = ~factor(year),  # years as categories
    y = ~n,
    color = ~genre,     # one coloured trace per genre
    # Custom tooltip; hoverinfo = "text" suppresses the default one
    text = ~paste("Genre:", genre, "<br>Year:", year, "<br>Count:", n),
    hoverinfo = "text"
  ) %>%
  layout(
    barmode = "stack",  # stack the genre counts within each year
    title = "Distribution of Art Sales Genres Over Time",
    legend = list(title = list(text = "Genre")),
    xaxis = list(title = "Year"),
    yaxis = list(title = "Count of Sales")
  )

plotly_genre_plot

This stacked bar chart shows the distribution of art sales across genres over time. Overall, the total count of sales has grown steadily, with a sharp increase around 2011–2012. Photography consistently dominates the genre distribution, contributing significantly to sales in most years. Sculpture and Print also maintain notable shares, especially during the later years. The growth in the Others and Multiple categories towards the end of the timeline suggests diversification in genres or a broader categorization of artworks in recent years. This trend reflects both increasing sales activity and shifts in genre popularity over time.

b. Generate an interactive plot with plotly that can address both of these questions from last time:

  1. Is there a change in the sales price in USD over time?

  2. How does the genre affect the change in sales price over time?

This should be a single interactive plot, with which a user can manipulate the view to be able to look at change over time overall, or by genre.

# Prepare data for average sales price by year and genre
avg_sales_by_genre <- art_genres %>%
                      group_by(year, genre) %>%
                      summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop")

# Overall average price per year, so the same plot can answer "is there a
# change over time overall?". It is computed from art_sales (one row per sale)
# because art_genres repeats a sale once for every genre it belongs to.
avg_sales_overall <- art_sales %>%
  group_by(year) %>%
  summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop") %>%
  mutate(genre = "Overall")

# Stack both summaries; fixing the factor levels puts "Overall" first in the legend
avg_sales_combined <- bind_rows(avg_sales_overall, avg_sales_by_genre) %>%
  mutate(genre = factor(
    genre,
    levels = c("Overall", sort(unique(avg_sales_by_genre$genre)))
  ))

# Create the interactive plot. Each series is its own trace, so the legend can
# be used to hide or isolate the overall line or any individual genre.
interactive_plot <- plot_ly(
  data = avg_sales_combined,
  x = ~year,                # Year on the x-axis
  y = ~avg_price,           # Average price on the y-axis
  color = ~genre,           # One trace per series (overall + each genre)
  type = "scatter",         # Create a scatter plot
  mode = "lines+markers",   # Add both lines and markers for better visibility
  text = ~paste("Genre:", genre, "<br>Year:", year, "<br>Avg Price:", round(avg_price, 2)), # Hover text
  hoverinfo = "text"        # Display only custom hover text
) %>%
  layout(
    title = "Change in Sales Price Over Time by Genre",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average Price (USD)"),
    legend = list(title = list(text = "Genre"))
  )

interactive_plot

The interactive plot reveals that average sales prices across art genres have fluctuated significantly over time, with a notable peak between 2005 and 2010, followed by a decline and stabilization post-2010. Photography experienced the most dramatic spikes, particularly around 2007–2009, likely driven by high-value sales, before declining sharply. Print and Sculpture show steady growth until 2007, after which prices slightly decline or stabilize, while Painting exhibits consistent pricing trends throughout the timeline. The Multiple and Others categories display sporadic peaks but lack consistent trends. These patterns suggest a dynamic market during the mid-2000s, potentially influenced by increased demand, high-profile sales, or external economic factors.

Problem 3

Repeat problem set 4, question 1, using data.table.

library(nycflights13)
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:dplyr':

    between, first, last
The following object is masked from 'package:purrr':

    transpose
  1. Generate a table (which can just be a nicely printed tibble) reporting the mean and median departure delay per airport. Generate a second table (which again can be a nicely printed tibble) reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights. Do this exclusion through code, not manually.

    Additionally,

    • Order both tables in descending mean delay.

    • Both tables should use the airport names not the airport codes.

    • Both tables should print all rows.

flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)
# Departure delay per origin airport: mean, median and flight count over
# flights with a recorded dep_delay, keeping origins with at least 10 flights
depart_delay <- flights_dt[
  !is.na(dep_delay),
  .(
    mean_dep_delay = mean(dep_delay),     # NAs were already filtered out above
    median_dep_delay = median(dep_delay),
    num_flight = .N
  ),
  by = origin
][
  num_flight >= 10
][
  # Inner join to pick up airport names; airports without a summary row are dropped
  airports_dt, on = .(origin = faa), nomatch = NULL
][
  # Sort by descending mean delay and keep the reporting columns
  order(-mean_dep_delay),
  .(dept_name = name, mean_dep_delay, median_dep_delay)
]

# Drop any row that still has a missing name or statistic
depart_delay <- depart_delay[complete.cases(depart_delay)]

print(depart_delay)
             dept_name mean_dep_delay median_dep_delay
                <char>          <num>            <num>
1: Newark Liberty Intl       15.10795               -1
2: John F Kennedy Intl       12.11216               -1
3:          La Guardia       10.34688               -3
# Arrival delay per destination airport: mean, median and flight count over
# flights with a recorded arr_delay, keeping destinations with at least 10 flights
arrival_delay <- flights_dt[
  !is.na(arr_delay),
  .(
    mean_arr_delay = mean(arr_delay),     # NAs were already filtered out above
    median_arr_delay = median(arr_delay),
    num_flight = .N
  ),
  by = dest
][
  num_flight >= 10
][
  # Inner join to pick up airport names.
  # NOTE(review): destinations whose code has no row in airports_dt are
  # dropped here rather than reported under their code - confirm intended.
  airports_dt, on = .(dest = faa), nomatch = NULL
][
  # Sort by descending mean delay and keep the reporting columns
  order(-mean_arr_delay),
  .(arr_name = name, mean_arr_delay, median_arr_delay)
]

# Drop any row that still has a missing name or statistic
arrival_delay <- arrival_delay[complete.cases(arrival_delay)]

print(arrival_delay)
                                arr_name mean_arr_delay median_arr_delay
                                  <char>          <num>            <num>
 1:                Columbia Metropolitan    41.76415094             28.0
 2:                           Tulsa Intl    33.65986395             14.0
 3:                    Will Rogers World    30.61904762             16.0
 4:                 Jackson Hole Airport    28.09523810             15.0
 5:                        Mc Ghee Tyson    24.06920415              2.0
 6:               Dane Co Rgnl Truax Fld    20.19604317              1.0
 7:                        Richmond Intl    20.11125320              1.0
 8:        Akron Canton Regional Airport    19.69833729              3.0
 9:                      Des Moines Intl    19.00573614              0.0
10:                   Gerald R Ford Intl    18.18956044              1.0
11:                      Birmingham Intl    16.87732342             -2.0
12:         Theodore Francis Green State    16.23463687              1.0
13: Greenville-Spartanburg International    15.93544304             -0.5
14:    Cincinnati Northern Kentucky Intl    15.36456376             -3.0
15:            Savannah Hilton Head Intl    15.12950601             -1.0
16:          Manchester Regional Airport    14.78755365             -3.0
17:                          Eppley Afld    14.69889841             -2.0
18:                               Yeager    14.67164179             -1.5
19:                     Kansas City Intl    14.51405836              0.0
20:                          Albany Intl    14.39712919             -4.0
21:                General Mitchell Intl    14.16722038              0.0
22:                       Piedmont Triad    14.11260054             -2.0
23:               Washington Dulles Intl    13.86420212             -3.0
24:               Cherry Capital Airport    12.96842105            -10.0
25:              James M Cox Dayton Intl    12.68048606             -3.0
26:     Louisville International Airport    12.66938406             -2.0
27:                  Chicago Midway Intl    12.36422360             -1.0
28:                      Sacramento Intl    12.10992908              4.0
29:                    Jacksonville Intl    11.84483416             -2.0
30:                       Nashville Intl    11.81245891             -2.0
31:                Portland Intl Jetport    11.66040210             -4.0
32:               Greater Rochester Intl    11.56064461             -5.0
33:      Hartsfield Jackson Atlanta Intl    11.30011285             -1.0
34:                Lambert St Louis Intl    11.07846451             -3.0
35:                         Norfolk Intl    10.94909344             -4.0
36:            Baltimore Washington Intl    10.72673385             -5.0
37:                         Memphis Intl    10.64531435             -2.5
38:                   Port Columbus Intl    10.60132291             -3.0
39:                  Charleston Afb Intl    10.59296847             -4.0
40:                    Philadelphia Intl    10.12719014             -3.0
41:                  Raleigh Durham Intl    10.05238095             -3.0
42:                    Indianapolis Intl     9.94043412             -3.0
43:            Charlottesville-Albemarle     9.50000000             -5.0
44:               Cleveland Hopkins Intl     9.18161129             -5.0
45:        Ronald Reagan Washington Natl     9.06695204             -2.0
46:                      Burlington Intl     8.95099602             -4.0
47:                 Buffalo Niagara Intl     8.94595186             -5.0
48:                Syracuse Hancock Intl     8.90392501             -5.0
49:                          Denver Intl     8.60650021             -2.0
50:                      Palm Beach Intl     8.56297210             -3.0
51:                             Bob Hope     8.17567568             -3.0
52:       Fort Lauderdale Hollywood Intl     8.08212154             -3.0
53:                          Bangor Intl     8.02793296             -9.0
54:           Asheville Regional Airport     8.00383142             -1.0
55:                      Pittsburgh Intl     7.68099053             -5.0
56:                       Gallatin Field     7.60000000             -2.0
57:                 NW Arkansas Regional     7.46572581             -2.0
58:                           Tampa Intl     7.40852503             -4.0
59:               Charlotte Douglas Intl     7.36031885             -3.0
60:             Minneapolis St Paul Intl     7.27016886             -5.0
61:                      William P Hobby     7.17618819             -4.0
62:                         Bradley Intl     7.04854369            -10.0
63:                     San Antonio Intl     6.94537178             -9.0
64:                      South Bend Rgnl     6.50000000             -3.5
65:     Louis Armstrong New Orleans Intl     6.49017497             -6.0
66:                        Key West Intl     6.35294118              7.0
67:                        Eagle Co Rgnl     6.30434783             -4.0
68:                Austin Bergstrom Intl     6.01990875             -5.0
69:                   Chicago Ohare Intl     5.87661475             -8.0
70:                         Orlando Intl     5.45464309             -5.0
71:               Detroit Metro Wayne Co     5.42996346             -7.0
72:                        Portland Intl     5.14157973             -5.0
73:                        Nantucket Mem     4.85227273             -3.0
74:                      Wilmington Intl     4.63551402             -7.0
75:                    Myrtle Beach Intl     4.60344828            -13.0
76:    Albuquerque International Sunport     4.38188976             -5.5
77:         George Bush Intercontinental     4.24079040             -5.0
78:        Norman Y Mineta San Jose Intl     3.44817073             -7.0
79:               Southwest Florida Intl     3.23814963             -5.0
80:                       San Diego Intl     3.13916574             -5.0
81:              Sarasota Bradenton Intl     3.08243131             -5.0
82:            Metropolitan Oakland Intl     3.07766990             -9.0
83:   General Edward Lawrence Logan Intl     2.91439222             -9.0
84:                   San Francisco Intl     2.67289152             -8.0
85:                         Yampa Valley     2.14285714              2.0
86:              Phoenix Sky Harbor Intl     2.09704733             -6.0
87:            Montrose Regional Airport     1.78571429            -10.5
88:                     Los Angeles Intl     0.54711094             -7.0
89:               Dallas Fort Worth Intl     0.32212685             -9.0
90:                           Miami Intl     0.29905978             -9.0
91:                       Mc Carran Intl     0.25772849             -8.0
92:                  Salt Lake City Intl     0.17625459             -8.0
93:                           Long Beach    -0.06202723            -10.0
94:                Martha\\\\'s Vineyard    -0.28571429            -11.0
95:                  Seattle Tacoma Intl    -1.09909910            -11.0
96:                        Honolulu Intl    -1.36519258             -7.0
97:            John Wayne Arpt Orange Co    -7.86822660            -11.0
98:                    Palm Springs Intl   -12.72222222            -13.5
                                arr_name mean_arr_delay median_arr_delay
planes_dt <- as.data.table(planes)

# Add a flight_speed column (MPH) to flights_dt by reference; rows missing
# air_time or distance are left as NA
flights_dt[
  !(is.na(air_time) | is.na(distance)),
  flight_speed := distance / (air_time / 60)
]

# Attach plane details, keeping only flights whose tailnum appears in planes_dt
new_flights <- flights_dt[planes_dt, on = "tailnum", nomatch = NULL]

# Average speed and number of flights per aircraft model, fastest model first;
# only the top row is kept
fastest_model <- new_flights[
  ,
  .(
    avg_speed = mean(flight_speed, na.rm = TRUE),
    flights_cnt = .N
  ),
  by = model
][
  order(-avg_speed)
][
  1L
]

print(fastest_model)
     model avg_speed flights_cnt
    <char>     <num>       <int>
1: 777-222  482.6254           4